/*
   Copyright (C) 2009 Red Hat, Inc.

   This software is licensed under the GNU General Public License,
   version 2 (GPLv2) (see COPYING for details), subject to the
   following clarification.

   With respect to binaries built using the Microsoft(R) Windows
   Driver Kit (WDK), GPLv2 does not extend to any code contained in or
   derived from the WDK ("WDK Code").  As to WDK Code, by using or
   distributing such binaries you agree to be bound by the Microsoft
   Software License Terms for the WDK.  All WDK Code is considered by
   the GPLv2 licensors to qualify for the special exception stated in
   section 3 of GPLv2 (commonly known as the system library
   exception).

   There is NO WARRANTY for this software, express or implied,
   including the implied warranties of NON-INFRINGEMENT, TITLE,
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/

// Red Hat image compression based on SFALIC by Roman Starosolski
// http://sun.iinf.polsl.gliwice.pl/~rstaros/sfalic/index.html

#include "stddef.h"

#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include "os_dep.h"

#include "winerror.h"
#include "windef.h"
#include "wingdi.h"
#include "winddi.h"
#include "devioctl.h"
#include "ntddvdeo.h"

#include "qxldd.h"
#include "utils.h"
#include "mspace.h"
#include "res.h"
#include "surface.h"


#include "quic.h"

//#define DEBUG

#define RLE
#define RLE_STAT
#define PRED_1
//#define RLE_PRED_1
#define RLE_PRED_2
//#define RLE_PRED_3
#define QUIC_RGB

#define QUIC_MAGIC (*(uint32_t *)"QUIC")
#define QUIC_VERSION_MAJOR 0U
#define QUIC_VERSION_MINOR 1U
#define QUIC_VERSION ((QUIC_VERSION_MAJOR << 16) | (QUIC_VERSION_MAJOR & 0xffff))

#ifdef ASSERT
#undef ASSERT
#endif

#ifdef DEBUG

#define ASSERT(usr, x) \
    if (!(x)) (usr)->error(usr, "%s: ASSERT %s failed\n", __FUNCTION__, #x);

#else

#define ASSERT(usr, x)

#endif

#define FALSE 0
#define TRUE 1

typedef uint8_t BYTE;

/* maximum number of codes in family */
#define MAXNUMCODES 8

/* model evolution, warning: only 1,3 and 5 allowed */
#define DEFevol 3
#define MINevol 0
#define MAXevol 5

/* starting wait mask index */
#define DEFwmistart 0
#define MINwmistart 0

/* codeword length limit */
#define DEFmaxclen 26

/* target wait mask index */
#define DEFwmimax 6

/* number of symbols to encode before increasing wait mask index */
#define DEFwminext 2048
#define MINwminext 1
#define MAXwminext 100000000

typedef struct QuicFamily {
    unsigned int nGRcodewords[MAXNUMCODES];      /* indexed by code number, contains number of
                                                    unmodofied GR codewords in the code */
    unsigned int notGRcwlen[MAXNUMCODES];        /* indexed by code number, contains codeword
                                                    length of the not-GR codeword */
    unsigned int notGRprefixmask[MAXNUMCODES];   /* indexed by code number, contains mask to
                                                    determine if the codeword is GR or not-GR */
    unsigned int notGRsuffixlen[MAXNUMCODES];    /* indexed by code number, contains suffix
                                                    length of the not-GR codeword */

    /* array for translating distribution U to L for depths up to 8 bpp,
    initialized by decorelateinit() */
    BYTE xlatU2L[256];

    /* array for translating distribution L to U for depths up to 8 bpp,
       initialized by corelateinit() */
    unsigned int xlatL2U[256];
} QuicFamily;

static QuicFamily family_8bpc;
static QuicFamily family_5bpc;

typedef unsigned COUNTER;   /* counter in the array of counters in bucket of the data model */

typedef struct s_bucket {
    COUNTER *pcounters;     /* pointer to array of counters */
    unsigned int bestcode;  /* best code so far */
} s_bucket;

typedef struct Encoder Encoder;

typedef struct CommonState {
    Encoder *encoder;

    unsigned int waitcnt;
    unsigned int tabrand_seed;
    unsigned int wm_trigger;
    unsigned int wmidx;
    unsigned int wmileft;

#ifdef RLE_STAT
    int melcstate; /* index to the state array */

    int melclen;  /* contents of the state array location
                     indexed by melcstate: the "expected"
                     run length is 2^melclen, shorter runs are
                     encoded by a 1 followed by the run length
                     in binary representation, wit a fixed length
                     of melclen bits */

    unsigned long melcorder;  /* 2^ melclen */
#endif
} CommonState;


#define MAX_CHANNELS 4

typedef struct FamilyStat {
    s_bucket **buckets_ptrs;
    s_bucket *buckets_buf;
    COUNTER *counters;
} FamilyStat;

typedef struct Channel {
    Encoder *encoder;

    int correlate_row_width;
    BYTE *correlate_row;

    s_bucket **_buckets_ptrs;

    FamilyStat family_stat_8bpc;
    FamilyStat family_stat_5bpc;

    CommonState state;
} Channel;

struct Encoder {
    QuicUsrContext *usr;
    QuicImageType type;
    unsigned int width;
    unsigned int height;
    unsigned int num_channels;

    unsigned int n_buckets_8bpc;
    unsigned int n_buckets_5bpc;

    unsigned int io_available_bits;
    uint32_t io_word;
    uint32_t io_next_word;
    uint32_t *io_now;
    uint32_t *io_end;
    uint32_t io_words_count;

    int rows_completed;

    Channel channels[MAX_CHANNELS];

    CommonState rgb_state;
};

/* target wait mask index */
static int wmimax = DEFwmimax;

/* number of symbols to encode before increasing wait mask index */
static int wminext = DEFwminext;

/* model evolution mode */
static int evol = DEFevol;

/* bppmask[i] contains i ones as lsb-s */
static const unsigned long int bppmask[33] = {
    0x00000000, /* [0] */
    0x00000001, 0x00000003, 0x00000007, 0x0000000f,
    0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
    0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
    0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
    0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff,
    0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
    0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff,
    0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff /* [32] */
};

static const unsigned int bitat[32] = {
    0x00000001, 0x00000002, 0x00000004, 0x00000008,
    0x00000010, 0x00000020, 0x00000040, 0x00000080,
    0x00000100, 0x00000200, 0x00000400, 0x00000800,
    0x00001000, 0x00002000, 0x00004000, 0x00008000,
    0x00010000, 0x00020000, 0x00040000, 0x00080000,
    0x00100000, 0x00200000, 0x00400000, 0x00800000,
    0x01000000, 0x02000000, 0x04000000, 0x08000000,
    0x10000000, 0x20000000, 0x40000000, 0x80000000 /* [31]*/
};


#define TABRAND_TABSIZE 256
#define TABRAND_SEEDMASK 0x0ff

static const unsigned int tabrand_chaos[TABRAND_TABSIZE] = {
    0x02c57542, 0x35427717, 0x2f5a2153, 0x9244f155, 0x7bd26d07, 0x354c6052, 0x57329b28, 0x2993868e,
    0x6cd8808c, 0x147b46e0, 0x99db66af, 0xe32b4cac, 0x1b671264, 0x9d433486, 0x62a4c192, 0x06089a4b,
    0x9e3dce44, 0xdaabee13, 0x222425ea, 0xa46f331d, 0xcd589250, 0x8bb81d7f, 0xc8b736b9, 0x35948d33,
    0xd7ac7fd0, 0x5fbe2803, 0x2cfbc105, 0x013dbc4e, 0x7a37820f, 0x39f88e9e, 0xedd58794, 0xc5076689,
    0xfcada5a4, 0x64c2f46d, 0xb3ba3243, 0x8974b4f9, 0x5a05aebd, 0x20afcd00, 0x39e2b008, 0x88a18a45,
    0x600bde29, 0xf3971ace, 0xf37b0a6b, 0x7041495b, 0x70b707ab, 0x06beffbb, 0x4206051f, 0xe13c4ee3,
    0xc1a78327, 0x91aa067c, 0x8295f72a, 0x732917a6, 0x1d871b4d, 0x4048f136, 0xf1840e7e, 0x6a6048c1,
    0x696cb71a, 0x7ff501c3, 0x0fc6310b, 0x57e0f83d, 0x8cc26e74, 0x11a525a2, 0x946934c7, 0x7cd888f0,
    0x8f9d8604, 0x4f86e73b, 0x04520316, 0xdeeea20c, 0xf1def496, 0x67687288, 0xf540c5b2, 0x22401484,
    0x3478658a, 0xc2385746, 0x01979c2c, 0x5dad73c8, 0x0321f58b, 0xf0fedbee, 0x92826ddf, 0x284bec73,
    0x5b1a1975, 0x03df1e11, 0x20963e01, 0xa17cf12b, 0x740d776e, 0xa7a6bf3c, 0x01b5cce4, 0x1118aa76,
    0xfc6fac0a, 0xce927e9b, 0x00bf2567, 0x806f216c, 0xbca69056, 0x795bd3e9, 0xc9dc4557, 0x8929b6c2,
    0x789d52ec, 0x3f3fbf40, 0xb9197368, 0xa38c15b5, 0xc3b44fa8, 0xca8333b0, 0xb7e8d590, 0xbe807feb,
    0xbf5f8360, 0xd99e2f5c, 0x372928e1, 0x7c757c4c, 0x0db5b154, 0xc01ede02, 0x1fc86e78, 0x1f3985be,
    0xb4805c77, 0x00c880fa, 0x974c1b12, 0x35ab0214, 0xb2dc840d, 0x5b00ae37, 0xd313b026, 0xb260969d,
    0x7f4c8879, 0x1734c4d3, 0x49068631, 0xb9f6a021, 0x6b863e6f, 0xcee5debf, 0x29f8c9fb, 0x53dd6880,
    0x72b61223, 0x1f67a9fd, 0x0a0f6993, 0x13e59119, 0x11cca12e, 0xfe6b6766, 0x16b6effc, 0x97918fc4,
    0xc2b8a563, 0x94f2f741, 0x0bfa8c9a, 0xd1537ae8, 0xc1da349c, 0x873c60ca, 0x95005b85, 0x9b5c080e,
    0xbc8abbd9, 0xe1eab1d2, 0x6dac9070, 0x4ea9ebf1, 0xe0cf30d4, 0x1ef5bd7b, 0xd161043e, 0x5d2fa2e2,
    0xff5d3cae, 0x86ed9f87, 0x2aa1daa1, 0xbd731a34, 0x9e8f4b22, 0xb1c2c67a, 0xc21758c9, 0xa182215d,
    0xccb01948, 0x8d168df7, 0x04238cfe, 0x368c3dbc, 0x0aeadca5, 0xbad21c24, 0x0a71fee5, 0x9fc5d872,
    0x54c152c6, 0xfc329483, 0x6783384a, 0xeddb3e1c, 0x65f90e30, 0x884ad098, 0xce81675a, 0x4b372f7d,
    0x68bf9a39, 0x43445f1e, 0x40f8d8cb, 0x90d5acb6, 0x4cd07282, 0x349eeb06, 0x0c9d5332, 0x520b24ef,
    0x80020447, 0x67976491, 0x2f931ca3, 0xfe9b0535, 0xfcd30220, 0x61a9e6cc, 0xa487d8d7, 0x3f7c5dd1,
    0x7d0127c5, 0x48f51d15, 0x60dea871, 0xc9a91cb7, 0x58b53bb3, 0x9d5e0b2d, 0x624a78b4, 0x30dbee1b,
    0x9bdf22e7, 0x1df5c299, 0x2d5643a7, 0xf4dd35ff, 0x03ca8fd6, 0x53b47ed8, 0x6f2c19aa, 0xfeb0c1f4,
    0x49e54438, 0x2f2577e6, 0xbf876969, 0x72440ea9, 0xfa0bafb8, 0x74f5b3a0, 0x7dd357cd, 0x89ce1358,
    0x6ef2cdda, 0x1e7767f3, 0xa6be9fdb, 0x4f5f88f8, 0xba994a3a, 0x08ca6b65, 0xe0893818, 0x9e00a16a,
    0xf42bfc8f, 0x9972eedc, 0x749c8b51, 0x32c05f5e, 0xd706805f, 0x6bfbb7cf, 0xd9210a10, 0x31a1db97,
    0x923a9559, 0x37a7a1f6, 0x059f8861, 0xca493e62, 0x65157e81, 0x8f6467dd, 0xab85ff9f, 0x9331aff2,
    0x8616b9f5, 0xedbd5695, 0xee7e29b1, 0x313ac44f, 0xb903112f, 0x432ef649, 0xdc0a36c0, 0x61cf2bba,
    0x81474925, 0xa8b6c7ad, 0xee5931de, 0xb2f8158d, 0x59fb7409, 0x2e3dfaed, 0x9af25a3f, 0xe1fed4d5,
};

static unsigned int stabrand()
{
    //ASSERT( !(TABRAND_SEEDMASK & TABRAND_TABSIZE));
    //ASSERT( TABRAND_SEEDMASK + 1 == TABRAND_TABSIZE );

    return TABRAND_SEEDMASK;
}

static unsigned int tabrand(unsigned int *tabrand_seed)
{
    return tabrand_chaos[++*tabrand_seed & TABRAND_SEEDMASK];
}

static const unsigned short besttrigtab[3][11] = { /* array of wm_trigger for waitmask and evol,
                                                    used by set_wm_trigger() */
    /* 1 */ { 550, 900, 800, 700, 500, 350, 300, 200, 180, 180, 160},
    /* 3 */ { 110, 550, 900, 800, 550, 400, 350, 250, 140, 160, 140},
    /* 5 */ { 100, 120, 550, 900, 700, 500, 400, 300, 220, 250, 160}
};

/* set wm_trigger knowing waitmask (param) and evol (glob)*/
static void set_wm_trigger(CommonState *state)
{
    unsigned int wm = state->wmidx;
    if (wm > 10) {
        wm = 10;
    }

    ASSERT(state->encoder->usr, evol < 6);

    state->wm_trigger = besttrigtab[evol / 2][wm];

    ASSERT(state->encoder->usr, state->wm_trigger <= 2000);
    ASSERT(state->encoder->usr, state->wm_trigger >= 1);
}

static int ceil_log_2(int val) /* ceil(log_2(val)) */
{
    int result;

    //ASSERT(val>0);

    if (val == 1) {
        return 0;
    }

    result = 1;
    val -= 1;
    while (val >>= 1) {
        result++;
    }

    return result;
}

/* number of leading zeroes in the byte, used by cntlzeroes(uint)*/
static const BYTE lzeroes[256] = {
    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* count leading zeroes */
static unsigned int cnt_l_zeroes(const unsigned int bits)
{
    if (bits & 0xff800000) {
        return lzeroes[bits >> 24];
    } else if (bits & 0xffff8000) {
        return 8 + lzeroes[(bits >> 16) & 0x000000ff];
    } else if (bits & 0xffffff80) {
        return 16 + lzeroes[(bits >> 8) & 0x000000ff];
    } else {
        return 24 + lzeroes[bits & 0x000000ff];
    }
}

#define QUIC_FAMILY_8BPC
#include "quic_family_tmpl.c"

#ifdef QUIC_RGB
#define QUIC_FAMILY_5BPC
#include "quic_family_tmpl.c"
#endif

static void decorelate_init(QuicFamily *family, int bpc)
{
    const unsigned int pixelbitmask = bppmask[bpc];
    const unsigned int pixelbitmaskshr = pixelbitmask >> 1;
    unsigned int s;

    //ASSERT(bpc <= 8);

    for (s = 0; s <= pixelbitmask; s++) {
        if (s <= pixelbitmaskshr) {
            family->xlatU2L[s] = s << 1;
        } else {
            family->xlatU2L[s] = ((pixelbitmask - s) << 1) + 1;
        }
    }
}

static void corelate_init(QuicFamily *family, int bpc)
{
    const unsigned long int pixelbitmask = bppmask[bpc];
    unsigned long int s;

    //ASSERT(bpc <= 8);

    for (s = 0; s <= pixelbitmask; s++) {
        if (s & 0x01) {
            family->xlatL2U[s] = pixelbitmask - (s >> 1);
        } else {
            family->xlatL2U[s] = (s >> 1);
        }
    }
}

static void family_init(QuicFamily *family, int bpc, int limit)
{
    int l;

    for (l = 0; l < bpc; l++) { /* fill arrays indexed by code number */
        int altprefixlen, altcodewords;

        altprefixlen = limit - bpc;
        if (altprefixlen > (int)(bppmask[bpc - l])) {
            altprefixlen = bppmask[bpc - l];
        }

        altcodewords = bppmask[bpc] + 1 - (altprefixlen << l);

        family->nGRcodewords[l] = (altprefixlen << l);
        family->notGRcwlen[l] = altprefixlen + ceil_log_2(altcodewords);
        family->notGRprefixmask[l] = bppmask[32 - altprefixlen]; /* needed for decoding only */
        family->notGRsuffixlen[l] = ceil_log_2(altcodewords); /* needed for decoding only */
    }

    decorelate_init(family, bpc);
    corelate_init(family, bpc);
}

static void more_io_words(Encoder *encoder)
{
    uint32_t *io_ptr;
    int num_io_words = encoder->usr->more_space(encoder->usr, &io_ptr, encoder->rows_completed);
    if (num_io_words <= 0) {
        encoder->usr->error(encoder->usr, "%s: no more words\n", __FUNCTION__);
    }
    ASSERT(encoder->usr, io_ptr);
    encoder->io_words_count += num_io_words;
    encoder->io_now = io_ptr;
    encoder->io_end = encoder->io_now + num_io_words;
}

static void __write_io_word(Encoder *encoder)
{
    more_io_words(encoder);
    *(encoder->io_now++) = encoder->io_word;
}

static void (*__write_io_word_ptr)(Encoder *encoder) = __write_io_word;

static INLINE void write_io_word(Encoder *encoder)
{
    if (encoder->io_now == encoder->io_end) {
        __write_io_word_ptr(encoder); //disable inline optimizations
        return;
    }
    *(encoder->io_now++) = encoder->io_word;
}

static INLINE void encode(Encoder *encoder, unsigned int word, unsigned int len)
{
    int delta;

    ASSERT(encoder->usr, len > 0 && len < 32);
    ASSERT(encoder->usr, !(word & ~bppmask[len]));
    if ((delta = ((int)encoder->io_available_bits - len)) >= 0) {
        encoder->io_available_bits = delta;
        encoder->io_word |= word << encoder->io_available_bits;
        return;
    }
    delta = -delta;
    encoder->io_word |= word >> delta;
    write_io_word(encoder);
    encoder->io_available_bits = 32 - delta;
    encoder->io_word = word << encoder->io_available_bits;

    ASSERT(encoder->usr, encoder->io_available_bits < 32);
    ASSERT(encoder->usr, (encoder->io_word & bppmask[encoder->io_available_bits]) == 0);
}

static INLINE void encode_32(Encoder *encoder, unsigned int word)
{
    encode(encoder, word >> 16, 16);
    encode(encoder, word & 0x0000ffff, 16);
}

static INLINE void flush(Encoder *encoder)
{
    if (encoder->io_available_bits > 0 && encoder->io_available_bits != 32) {
        encode(encoder, 0, encoder->io_available_bits);
    }
    encode_32(encoder, 0);
    encode(encoder, 0, 1);
}

static void __read_io_word(Encoder *encoder)
{
    more_io_words(encoder);
    encoder->io_next_word = *(encoder->io_now++);
}

static void (*__read_io_word_ptr)(Encoder *encoder) = __read_io_word;


static INLINE void read_io_word(Encoder *encoder)
{
    if (encoder->io_now == encoder->io_end) {
        __read_io_word_ptr(encoder); //disable inline optimizations
        return;
    }
    ASSERT(encoder->usr, encoder->io_now < encoder->io_end);
    encoder->io_next_word = *(encoder->io_now++);
}

static INLINE void decode_eatbits(Encoder *encoder, int len)
{
    int delta;

    ASSERT(encoder->usr, len > 0 && len < 32);
    encoder->io_word <<= len;

    if ((delta = ((int)encoder->io_available_bits - len)) >= 0) {
        encoder->io_available_bits = delta;
        encoder->io_word |= encoder->io_next_word >> encoder->io_available_bits;
        return;
    }

    delta = -delta;
    encoder->io_word |= encoder->io_next_word << delta;
    read_io_word(encoder);
    encoder->io_available_bits = 32 - delta;
    encoder->io_word |= (encoder->io_next_word >> encoder->io_available_bits);
}

static INLINE void decode_eat32bits(Encoder *encoder)
{
    decode_eatbits(encoder, 16);
    decode_eatbits(encoder, 16);
}

#ifdef RLE

#ifdef RLE_STAT

static INLINE void encode_ones(Encoder *encoder, unsigned int n)
{
    unsigned int count;

    for (count = n >> 5; count; count--) {
        encode(encoder, ~0U, 32);
    }

    if ((n &= 0x1f)) {
        encode(encoder, (1U << n) - 1, n);
    }
}

#define MELCSTATES 32 /* number of melcode states */

static int zeroLUT[256]; /* table to find out number of leading zeros */

static int J[MELCSTATES] = {
    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7,
    7, 8, 9, 10, 11, 12, 13, 14, 15
};

/* creates the bit counting look-up table. */
static void init_zeroLUT()
{
    int i, j, k, l;

    j = k = 1;
    l = 8;
    for (i = 0; i < 256; ++i) {
        zeroLUT[i] = l;
        --k;
        if (k == 0) {
            k = j;
            --l;
            j *= 2;
        }
    }
}

static void encoder_init_rle(CommonState *state)
{
    state->melcstate = 0;
    state->melclen = J[0];
    state->melcorder = 1 << state->melclen;
}

#ifdef QUIC_RGB

static void encode_run(Encoder *encoder, unsigned int runlen) //todo: try use end of line
{
    int hits = 0;

    while (runlen >= encoder->rgb_state.melcorder) {
        hits++;
        runlen -= encoder->rgb_state.melcorder;
        if (encoder->rgb_state.melcstate < MELCSTATES) {
            encoder->rgb_state.melclen = J[++encoder->rgb_state.melcstate];
            encoder->rgb_state.melcorder = (1L << encoder->rgb_state.melclen);
        }
    }

    /* send the required number of "hit" bits (one per occurrence
       of a run of length melcorder). This number is never too big:
       after 31 such "hit" bits, each "hit" would represent a run of 32K
       pixels.
    */
    encode_ones(encoder, hits);

    encode(encoder, runlen, encoder->rgb_state.melclen + 1);

    /* adjust melcoder parameters */
    if (encoder->rgb_state.melcstate) {
        encoder->rgb_state.melclen = J[--encoder->rgb_state.melcstate];
        encoder->rgb_state.melcorder = (1L << encoder->rgb_state.melclen);
    }
}

#endif

static void encode_channel_run(Encoder *encoder, Channel *channel, unsigned int runlen)
{
    //todo: try use end of line
    int hits = 0;

    while (runlen >= channel->state.melcorder) {
        hits++;
        runlen -= channel->state.melcorder;
        if (channel->state.melcstate < MELCSTATES) {
            channel->state.melclen = J[++channel->state.melcstate];
            channel->state.melcorder = (1L << channel->state.melclen);
        }
    }

    /* send the required number of "hit" bits (one per occurrence
       of a run of length melcorder). This number is never too big:
       after 31 such "hit" bits, each "hit" would represent a run of 32K
       pixels.
    */
    encode_ones(encoder, hits);

    encode(encoder, runlen, channel->state.melclen + 1);

    /* adjust melcoder parameters */
    if (channel->state.melcstate) {
        channel->state.melclen = J[--channel->state.melcstate];
        channel->state.melcorder = (1L << channel->state.melclen);
    }
}

/* decoding routine: reads bits from the input and returns a run length. */
/* argument is the number of pixels left to end-of-line (bound on run length) */

#ifdef QUIC_RGB
static int decode_run(Encoder *encoder)
{
    int runlen = 0;

    do {
        register int temp, hits;
        temp = zeroLUT[(BYTE)(~(encoder->io_word >> 24))];/* number of leading ones in the
                                                                      input stream, up to 8 */
        for (hits = 1; hits <= temp; hits++) {
            runlen += encoder->rgb_state.melcorder;

            if (encoder->rgb_state.melcstate < MELCSTATES) {
                encoder->rgb_state.melclen = J[++encoder->rgb_state.melcstate];
                encoder->rgb_state.melcorder = (1U << encoder->rgb_state.melclen);
            }
        }
        if (temp != 8) {
            decode_eatbits(encoder, temp + 1);  /* consume the leading
                                                            0 of the remainder encoding */
            break;
        }
        decode_eatbits(encoder, 8);
    } while (1);

    /* read the length of the remainder */
    if (encoder->rgb_state.melclen) {
        runlen += encoder->io_word >> (32 - encoder->rgb_state.melclen);
        decode_eatbits(encoder, encoder->rgb_state.melclen);
    }

    /* adjust melcoder parameters */
    if (encoder->rgb_state.melcstate) {
        encoder->rgb_state.melclen = J[--encoder->rgb_state.melcstate];
        encoder->rgb_state.melcorder = (1U << encoder->rgb_state.melclen);
    }

    return runlen;
}

#endif

static int decode_channel_run(Encoder *encoder, Channel *channel)
{
    int runlen = 0;

    do {
        register int temp, hits;
        temp = zeroLUT[(BYTE)(~(encoder->io_word >> 24))];/* number of leading ones in the
                                                                      input stream, up to 8 */
        for (hits = 1; hits <= temp; hits++) {
            runlen += channel->state.melcorder;

            if (channel->state.melcstate < MELCSTATES) {
                channel->state.melclen = J[++channel->state.melcstate];
                channel->state.melcorder = (1U << channel->state.melclen);
            }
        }
        if (temp != 8) {
            decode_eatbits(encoder, temp + 1);  /* consume the leading
                                                            0 of the remainder encoding */
            break;
        }
        decode_eatbits(encoder, 8);
    } while (1);

    /* read the length of the remainder */
    if (channel->state.melclen) {
        runlen += encoder->io_word >> (32 - channel->state.melclen);
        decode_eatbits(encoder, channel->state.melclen);
    }

    /* adjust melcoder parameters */
    if (channel->state.melcstate) {
        channel->state.melclen = J[--channel->state.melcstate];
        channel->state.melcorder = (1U << channel->state.melclen);
    }

    return runlen;
}

#else

static INLINE int find_msb(int x)
{
    int r;

    __asm__("bsrl %1,%0\n\t"
            "jnz 1f\n\t"
            "movl $-1,%0\n"
            "1:" : "=r" (r) : "rm" (x));
    return r + 1;
}

static INLINE void encode_run(Encoder *encoder, unsigned int len)
{
    int odd = len & 1U;
    int msb;

    len &= ~1U;

    while ((msb = find_msb(len))) {
        len &= ~(1 << (msb - 1));
        ASSERT(encoder->usr, msb < 32);
        encode(encoder, (1 << (msb)) - 1, msb);
        encode(encoder, 0, 1);
    }

    if (odd) {
        encode(encoder, 2, 2);
    } else {
        encode(encoder, 0, 1);
    }
}

static INLINE unsigned int decode_run(Encoder *encoder)
{
    unsigned int len = 0;
    int count;

    do {
        count = 0;
        while (encoder->io_word & (1U << 31)) {
            decode_eatbits(encoder, 1);
            count++;
            ASSERT(encoder->usr, count < 32);
        }
        decode_eatbits(encoder, 1);
        len += (1U << count) >> 1;
    } while (count > 1);

    return len;
}

#endif
#endif

static INLINE void init_decode_io(Encoder *encoder)
{
    encoder->io_next_word = encoder->io_word = *(encoder->io_now++);
    encoder->io_available_bits = 0;
}

#ifdef __GNUC__
#define ATTR_PACKED __attribute__ ((__packed__))
#else
#define ATTR_PACKED
#pragma pack(push)
#pragma pack(1)
#endif

typedef struct ATTR_PACKED one_byte_pixel_t {
    BYTE a;
} one_byte_t;

typedef struct ATTR_PACKED three_bytes_pixel_t {
    BYTE a;
    BYTE b;
    BYTE c;
} three_bytes_t;

typedef struct ATTR_PACKED four_bytes_pixel_t {
    BYTE a;
    BYTE b;
    BYTE c;
    BYTE d;
} four_bytes_t;

typedef struct ATTR_PACKED rgb32_pixel_t {
    BYTE b;
    BYTE g;
    BYTE r;
    BYTE pad;
} rgb32_pixel_t;

typedef struct ATTR_PACKED rgb24_pixel_t {
    BYTE b;
    BYTE g;
    BYTE r;
} rgb24_pixel_t;

typedef uint16_t rgb16_pixel_t;

#ifndef __GNUC__
#pragma pack(pop)
#endif

#undef ATTR_PACKED

#define ONE_BYTE
#include "quic_tmpl.c"

#define FOUR_BYTE
#include "quic_tmpl.c"

#ifdef QUIC_RGB

#define QUIC_RGB32
#include "quic_rgb_tmpl.c"

#define QUIC_RGB24
#include "quic_rgb_tmpl.c"

#define QUIC_RGB16
#include "quic_rgb_tmpl.c"

#define QUIC_RGB16_TO_32
#include "quic_rgb_tmpl.c"

#else

#define THREE_BYTE
#include "quic_tmpl.c"

#endif

static void fill_model_structures(Encoder *encoder, FamilyStat *family_stat,
                                  unsigned int rep_first, unsigned int first_size,
                                  unsigned int rep_next, unsigned int mul_size,
                                  unsigned int levels, unsigned int ncounters,
                                  unsigned int nbuckets, unsigned int n_buckets_ptrs)
{
    unsigned int
    bsize,
    bstart,
    bend = 0,
    repcntr,
    bnumber;

    COUNTER * free_counter = family_stat->counters;/* first free location in the array of
                                                      counters */

    bnumber = 0;

    repcntr = rep_first + 1;    /* first bucket */
    bsize = first_size;

    do { /* others */
        if (bnumber) {
            bstart = bend + 1;
        } else {
            bstart = 0;
        }

        if (!--repcntr) {
            repcntr = rep_next;
            bsize *= mul_size;
        }

        bend = bstart + bsize - 1;
        if (bend + bsize >= levels) {
            bend = levels - 1;
        }

        family_stat->buckets_buf[bnumber].pcounters = free_counter;
        free_counter += ncounters;

        ASSERT(encoder->usr, bstart < n_buckets_ptrs);
        {
            unsigned int i;
            ASSERT(encoder->usr, bend < n_buckets_ptrs);
            for (i = bstart; i <= bend; i++) {
                family_stat->buckets_ptrs[i] = family_stat->buckets_buf + bnumber;
            }
        }

        bnumber++;
    } while (bend < levels - 1);

    ASSERT(encoder->usr, free_counter - family_stat->counters == nbuckets * ncounters);
}

static void find_model_params(Encoder *encoder,
                              const int bpc,
                              unsigned int *ncounters,
                              unsigned int *levels,
                              unsigned int *n_buckets_ptrs,
                              unsigned int *repfirst,
                              unsigned int *firstsize,
                              unsigned int *repnext,
                              unsigned int *mulsize,
                              unsigned int *nbuckets)
{
    unsigned int bsize;              /* bucket size */
    unsigned int bstart, bend = 0;   /* bucket start and end, range : 0 to levels-1*/
    unsigned int repcntr;            /* helper */

    ASSERT(encoder->usr, bpc <= 8 && bpc > 0);


    *ncounters = 8;

    *levels = 0x1 << bpc;

    *n_buckets_ptrs = 0;  /* ==0 means: not set yet */

    switch (evol) {   /* set repfirst firstsize repnext mulsize */
    case 1: /* buckets contain following numbers of contexts: 1 1 1 2 2 4 4 8 8 ... */
        *repfirst = 3;
        *firstsize = 1;
        *repnext = 2;
        *mulsize = 2;
        break;
    case 3: /* 1 2 4 8 16 32 64 ... */
        *repfirst = 1;
        *firstsize = 1;
        *repnext = 1;
        *mulsize = 2;
        break;
    case 5:                     /* 1 4 16 64 256 1024 4096 16384 65536 */
        *repfirst = 1;
        *firstsize = 1;
        *repnext = 1;
        *mulsize = 4;
        break;
    case 0: /* obsolete */
    case 2: /* obsolete */
    case 4: /* obsolete */
        encoder->usr->error(encoder->usr, "findmodelparams(): evol value obsolete!!!\n");
    default:
        encoder->usr->error(encoder->usr, "findmodelparams(): evol out of range!!!\n");
    }

    *nbuckets = 0;
    repcntr = *repfirst + 1;    /* first bucket */
    bsize = *firstsize;

    do { /* other buckets */
        if (nbuckets) {         /* bucket start */
            bstart = bend + 1;
        } else {
            bstart = 0;
        }

        if (!--repcntr) {         /* bucket size */
            repcntr = *repnext;
            bsize *= *mulsize;
        }

        bend = bstart + bsize - 1;  /* bucket end */
        if (bend + bsize >= *levels) {  /* if following bucked was bigger than current one */
            bend = *levels - 1;     /* concatenate them */
        }

        if (!*n_buckets_ptrs) {           /* array size not set yet? */
            *n_buckets_ptrs = *levels;
 #if 0
            if (bend == *levels - 1) {   /* this bucket is last - all in the first array */
                *n_buckets_ptrs = *levels;
            } else if (bsize >= 256) { /* this bucket is allowed to reside in the 2nd table */
                b_lo_ptrs = bstart;
                assert(bstart);     /* previous bucket exists */
            }
 #endif
        }

        (*nbuckets)++;
    } while (bend < *levels - 1);
}

static int init_model_structures(Encoder *encoder, FamilyStat *family_stat,
                                 unsigned int rep_first, unsigned int first_size,
                                 unsigned int rep_next, unsigned int mul_size,
                                 unsigned int levels, unsigned int ncounters,
                                 unsigned int n_buckets_ptrs, unsigned int n_buckets)
{
    family_stat->buckets_ptrs = (s_bucket **)encoder->usr->malloc(encoder->usr,
                                                                  n_buckets_ptrs *
                                                                  sizeof(s_bucket *));
    if (!family_stat->buckets_ptrs) {
        return FALSE;
    }

    family_stat->counters = (COUNTER *)encoder->usr->malloc(encoder->usr,
                                                            n_buckets * sizeof(COUNTER) *
                                                            MAXNUMCODES);
    if (!family_stat->counters) {
        goto error_1;
    }

    family_stat->buckets_buf = (s_bucket *)encoder->usr->malloc(encoder->usr,
                                                                n_buckets * sizeof(s_bucket));
    if (!family_stat->buckets_buf) {
        goto error_2;
    }

    fill_model_structures(encoder, family_stat, rep_first, first_size, rep_next, mul_size, levels,
                          ncounters, n_buckets, n_buckets_ptrs);

    return TRUE;

error_2:
    encoder->usr->free(encoder->usr, family_stat->counters);

error_1:
    encoder->usr->free(encoder->usr, family_stat->buckets_ptrs);

    return FALSE;
}

static void free_family_stat(QuicUsrContext *usr, FamilyStat *family_stat)
{
    usr->free(usr, family_stat->buckets_ptrs);
    usr->free(usr, family_stat->counters);
    usr->free(usr, family_stat->buckets_buf);
}

static int init_channel(Encoder *encoder, Channel *channel)
{
    unsigned int ncounters;
    unsigned int levels;
    unsigned int rep_first;
    unsigned int first_size;
    unsigned int rep_next;
    unsigned int mul_size;
    unsigned int n_buckets;
    unsigned int n_buckets_ptrs;

    channel->encoder = encoder;
    channel->state.encoder = encoder;
    channel->correlate_row_width = 0;
    channel->correlate_row = NULL;

    find_model_params(encoder, 8, &ncounters, &levels, &n_buckets_ptrs, &rep_first,
                      &first_size, &rep_next, &mul_size, &n_buckets);
    encoder->n_buckets_8bpc = n_buckets;
    if (!init_model_structures(encoder, &channel->family_stat_8bpc, rep_first, first_size,
                               rep_next, mul_size, levels, ncounters, n_buckets_ptrs,
                               n_buckets)) {
        return FALSE;
    }

    find_model_params(encoder, 5, &ncounters, &levels, &n_buckets_ptrs, &rep_first,
                      &first_size, &rep_next, &mul_size, &n_buckets);
    encoder->n_buckets_5bpc = n_buckets;
    if (!init_model_structures(encoder, &channel->family_stat_5bpc, rep_first, first_size,
                               rep_next, mul_size, levels, ncounters, n_buckets_ptrs,
                               n_buckets)) {
        free_family_stat(encoder->usr, &channel->family_stat_8bpc);
        return FALSE;
    }

    return TRUE;
}

static void destroy_channel(Channel *channel)
{
    QuicUsrContext *usr = channel->encoder->usr;
    if (channel->correlate_row) {
        usr->free(usr, channel->correlate_row - 1);
    }
    free_family_stat(usr, &channel->family_stat_8bpc);
    free_family_stat(usr, &channel->family_stat_5bpc);
}

static int init_encoder(Encoder *encoder, QuicUsrContext *usr)
{
    int i;

    encoder->usr = usr;
    encoder->rgb_state.encoder = encoder;

    for (i = 0; i < MAX_CHANNELS; i++) {
        if (!init_channel(encoder, &encoder->channels[i])) {
            for (--i; i >= 0; i--) {
                destroy_channel(&encoder->channels[i]);
            }
            return FALSE;
        }
    }
    return TRUE;
}

static int encoder_reste(Encoder *encoder, uint32_t *io_ptr, uint32_t *io_ptr_end)
{
    ASSERT(encoder->usr, ((unsigned long)io_ptr % 4) == ((unsigned long)io_ptr_end % 4));
    ASSERT(encoder->usr, io_ptr <= io_ptr_end);

    encoder->rgb_state.waitcnt = 0;
    encoder->rgb_state.tabrand_seed = stabrand();
    encoder->rgb_state.wmidx = DEFwmistart;
    encoder->rgb_state.wmileft = wminext;
    set_wm_trigger(&encoder->rgb_state);

#if defined(RLE) && defined(RLE_STAT)
    encoder_init_rle(&encoder->rgb_state);
#endif

    encoder->io_words_count = (uint32_t)(io_ptr_end - io_ptr);
    encoder->io_now = io_ptr;
    encoder->io_end = io_ptr_end;
    encoder->rows_completed = 0;

    return TRUE;
}

static int encoder_reste_channels(Encoder *encoder, int channels, int width, int bpc)
{
    int i;

    encoder->num_channels = channels;

    for (i = 0; i < channels; i++) {
        s_bucket *bucket;
        s_bucket *end_bucket;

        if (encoder->channels[i].correlate_row_width < width) {
            encoder->channels[i].correlate_row_width = 0;
            if (encoder->channels[i].correlate_row) {
                encoder->usr->free(encoder->usr, encoder->channels[i].correlate_row - 1);
            }
            if (!(encoder->channels[i].correlate_row = (BYTE *)encoder->usr->malloc(encoder->usr,
                                                                                    width + 1))) {
                return FALSE;
            }
            encoder->channels[i].correlate_row++;
            encoder->channels[i].correlate_row_width = width;
        }

        if (bpc == 8) {
            MEMCLEAR(encoder->channels[i].family_stat_8bpc.counters,
                     encoder->n_buckets_8bpc * sizeof(COUNTER) * MAXNUMCODES);
            bucket = encoder->channels[i].family_stat_8bpc.buckets_buf;
            end_bucket = bucket + encoder->n_buckets_8bpc;
            for (; bucket < end_bucket; bucket++) {
                bucket->bestcode = /*BPC*/ 8 - 1;
            }
            encoder->channels[i]._buckets_ptrs = encoder->channels[i].family_stat_8bpc.buckets_ptrs;
        } else if (bpc == 5) {
            MEMCLEAR(encoder->channels[i].family_stat_5bpc.counters,
                     encoder->n_buckets_5bpc * sizeof(COUNTER) * MAXNUMCODES);
            bucket = encoder->channels[i].family_stat_5bpc.buckets_buf;
            end_bucket = bucket + encoder->n_buckets_5bpc;
            for (; bucket < end_bucket; bucket++) {
                bucket->bestcode = /*BPC*/ 5 - 1;
            }
            encoder->channels[i]._buckets_ptrs = encoder->channels[i].family_stat_5bpc.buckets_ptrs;
        } else {
            encoder->usr->warn(encoder->usr, "%s: bad bpc %d\n", __FUNCTION__, bpc);
            return FALSE;
        }

        encoder->channels[i].state.waitcnt = 0;
        encoder->channels[i].state.tabrand_seed = stabrand();
        encoder->channels[i].state.wmidx = DEFwmistart;
        encoder->channels[i].state.wmileft = wminext;
        set_wm_trigger(&encoder->channels[i].state);

#if defined(RLE) && defined(RLE_STAT)
        encoder_init_rle(&encoder->channels[i].state);
#endif
    }
    return TRUE;
}

static void quic_image_params(Encoder *encoder, QuicImageType type, int *channels, int *bpc)
{
    ASSERT(encoder->usr, channels && bpc);
    switch (type) {
    case QUIC_IMAGE_TYPE_GRAY:
        *channels = 1;
        *bpc = 8;
        break;
    case QUIC_IMAGE_TYPE_RGB16:
        *channels = 3;
        *bpc = 5;
#ifndef QUIC_RGB
        encoder->usr->error(encoder->usr, "not implemented\n");
#endif
        break;
    case QUIC_IMAGE_TYPE_RGB24:
        *channels = 3;
        *bpc = 8;
        break;
    case QUIC_IMAGE_TYPE_RGB32:
        *channels = 3;
        *bpc = 8;
        break;
    case QUIC_IMAGE_TYPE_RGBA:
        *channels = 4;
        *bpc = 8;
        break;
    case QUIC_IMAGE_TYPE_INVALID:
    default:
        *channels = 0;
        *bpc = 0;
        encoder->usr->error(encoder->usr, "bad image type\n");
    }
}

#define FILL_LINES() {                                                  \
    if (line == lines_end) {                                            \
        int n = encoder->usr->more_lines(encoder->usr, &line);          \
        if (n <= 0) {                                                   \
            encoder->usr->error(encoder->usr, "more lines failed\n");   \
        }                                                               \
        lines_end = line + n * stride;                                  \
    }                                                                   \
}

#define NEXT_LINE() {       \
    line += stride;         \
    FILL_LINES();           \
}

#define QUIC_COMPRESS_RGB(bits)                                                                 \
        encoder->channels[0].correlate_row[-1] = 0;                                             \
        encoder->channels[1].correlate_row[-1] = 0;                                             \
        encoder->channels[2].correlate_row[-1] = 0;                                             \
        quic_rgb##bits##_compress_row0(encoder, (rgb##bits##_pixel_t *)(line), width);          \
        encoder->rows_completed++;                                                              \
        for (row = 1; row < height; row++) {                                                    \
            prev = line;                                                                        \
            NEXT_LINE();                                                                        \
            encoder->channels[0].correlate_row[-1] = encoder->channels[0].correlate_row[0];     \
            encoder->channels[1].correlate_row[-1] = encoder->channels[1].correlate_row[0];     \
            encoder->channels[2].correlate_row[-1] = encoder->channels[2].correlate_row[0];     \
            quic_rgb##bits##_compress_row(encoder, (rgb##bits##_pixel_t *)prev,                 \
                                          (rgb##bits##_pixel_t *)line, width);                  \
            encoder->rows_completed++;                                                          \
        }

int quic_encode(QuicContext *quic, QuicImageType type, int width, int height,
                uint8_t *line, unsigned int num_lines, int stride,
                uint32_t *io_ptr, unsigned int num_io_words)
{
    Encoder *encoder = (Encoder *)quic;
    uint32_t *io_ptr_end = io_ptr + num_io_words;
    uint8_t *lines_end;
    int row;
    uint8_t *prev;
    int channels;
    int bpc;
#ifndef QUIC_RGB
    int i;
#endif

    ASSERT(encoder->usr, line);
    lines_end = line + num_lines * stride;

    quic_image_params(encoder, type, &channels, &bpc);

    if (!encoder_reste(encoder, io_ptr, io_ptr_end) ||
        !encoder_reste_channels(encoder, channels, width, bpc)) {
        return QUIC_ERROR;
    }

    encoder->io_word = 0;
    encoder->io_available_bits = 32;

    encode_32(encoder, QUIC_MAGIC);
    encode_32(encoder, QUIC_VERSION);
    encode_32(encoder, type);
    encode_32(encoder, width);
    encode_32(encoder, height);

    FILL_LINES();

    switch (type) {
#ifdef QUIC_RGB
    case QUIC_IMAGE_TYPE_RGB32:
        ASSERT(encoder->usr, ABS(stride) >= width * 4);
        QUIC_COMPRESS_RGB(32);
        break;
    case QUIC_IMAGE_TYPE_RGB24:
        ASSERT(encoder->usr, ABS(stride) >= width * 3);
        QUIC_COMPRESS_RGB(24);
        break;
    case QUIC_IMAGE_TYPE_RGB16:
        ASSERT(encoder->usr, ABS(stride) >= width * 2);
        QUIC_COMPRESS_RGB(16);
        break;
    case QUIC_IMAGE_TYPE_RGBA:
        ASSERT(encoder->usr, ABS(stride) >= width * 4);

        encoder->channels[0].correlate_row[-1] = 0;
        encoder->channels[1].correlate_row[-1] = 0;
        encoder->channels[2].correlate_row[-1] = 0;
        quic_rgb32_compress_row0(encoder, (rgb32_pixel_t *)(line), width);

        encoder->channels[3].correlate_row[-1] = 0;
        quic_four_compress_row0(encoder, &encoder->channels[3], (four_bytes_t *)(line + 3), width);

        encoder->rows_completed++;

        for (row = 1; row < height; row++) {
            prev = line;
            NEXT_LINE();
            encoder->channels[0].correlate_row[-1] = encoder->channels[0].correlate_row[0];
            encoder->channels[1].correlate_row[-1] = encoder->channels[1].correlate_row[0];
            encoder->channels[2].correlate_row[-1] = encoder->channels[2].correlate_row[0];
            quic_rgb32_compress_row(encoder, (rgb32_pixel_t *)prev, (rgb32_pixel_t *)line, width);

            encoder->channels[3].correlate_row[-1] = encoder->channels[3].correlate_row[0];
            quic_four_compress_row(encoder, &encoder->channels[3], (four_bytes_t *)(prev + 3),
                                   (four_bytes_t *)(line + 3), width);
            encoder->rows_completed++;
        }
        break;
#else
    case QUIC_IMAGE_TYPE_RGB24:
        ASSERT(encoder->usr, ABS(stride) >= width * 3);
        for (i = 0; i < 3; i++) {
            encoder->channels[i].correlate_row[-1] = 0;
            quic_three_compress_row0(encoder, &encoder->channels[i], (three_bytes_t *)(line + i),
                                     width);
        }
        encoder->rows_completed++;
        for (row = 1; row < height; row++) {
            prev = line;
            NEXT_LINE();
            for (i = 0; i < 3; i++) {
                encoder->channels[i].correlate_row[-1] = encoder->channels[i].correlate_row[0];
                quic_three_compress_row(encoder, &encoder->channels[i], (three_bytes_t *)(prev + i),
                                        (three_bytes_t *)(line + i), width);
            }
            encoder->rows_completed++;
        }
        break;
    case QUIC_IMAGE_TYPE_RGB32:
    case QUIC_IMAGE_TYPE_RGBA:
        ASSERT(encoder->usr, ABS(stride) >= width * 4);
        for (i = 0; i < channels; i++) {
            encoder->channels[i].correlate_row[-1] = 0;
            quic_four_compress_row0(encoder, &encoder->channels[i], (four_bytes_t *)(line + i),
                                    width);
        }
        encoder->rows_completed++;
        for (row = 1; row < height; row++) {
            prev = line;
            NEXT_LINE();
            for (i = 0; i < channels; i++) {
                encoder->channels[i].correlate_row[-1] = encoder->channels[i].correlate_row[0];
                quic_four_compress_row(encoder, &encoder->channels[i], (four_bytes_t *)(prev + i),
                                       (four_bytes_t *)(line + i), width);
            }
            encoder->rows_completed++;
        }
        break;
#endif
    case QUIC_IMAGE_TYPE_GRAY:
        ASSERT(encoder->usr, ABS(stride) >= width);
        encoder->channels[0].correlate_row[-1] = 0;
        quic_one_compress_row0(encoder, &encoder->channels[0], (one_byte_t *)line, width);
        encoder->rows_completed++;
        for (row = 1; row < height; row++) {
            prev = line;
            NEXT_LINE();
            encoder->channels[0].correlate_row[-1] = encoder->channels[0].correlate_row[0];
            quic_one_compress_row(encoder, &encoder->channels[0], (one_byte_t *)prev,
                                  (one_byte_t *)line, width);
            encoder->rows_completed++;
        }
        break;
    case QUIC_IMAGE_TYPE_INVALID:
    default:
        encoder->usr->error(encoder->usr, "bad image type\n");
    }

    flush(encoder);
    encoder->io_words_count -= (uint32_t)(encoder->io_end - encoder->io_now);

    return encoder->io_words_count;
}

int quic_decode_begin(QuicContext *quic, uint32_t *io_ptr, unsigned int num_io_words,
                      QuicImageType *out_type, int *out_width, int *out_height)
{
    Encoder *encoder = (Encoder *)quic;
    uint32_t *io_ptr_end = io_ptr + num_io_words;
    QuicImageType type;
    int width;
    int height;
    uint32_t magic;
    uint32_t version;
    int channels;
    int bpc;

    if (!encoder_reste(encoder, io_ptr, io_ptr_end)) {
        return QUIC_ERROR;
    }

    init_decode_io(encoder);

    magic = encoder->io_word;
    decode_eat32bits(encoder);
    if (magic != QUIC_MAGIC) {
        encoder->usr->warn(encoder->usr, "bad magic\n");
        return QUIC_ERROR;
    }

    version = encoder->io_word;
    decode_eat32bits(encoder);
    if (version != QUIC_VERSION) {
        encoder->usr->warn(encoder->usr, "bad version\n");
        return QUIC_ERROR;
    }

    type = (QuicImageType)encoder->io_word;
    decode_eat32bits(encoder);

    width = encoder->io_word;
    decode_eat32bits(encoder);

    height = encoder->io_word;
    decode_eat32bits(encoder);

    quic_image_params(encoder, type, &channels, &bpc);

    if (!encoder_reste_channels(encoder, channels, width, bpc)) {
        return QUIC_ERROR;
    }

    *out_width = encoder->width = width;
    *out_height = encoder->height = height;
    *out_type = encoder->type = type;
    return QUIC_OK;
}

#ifndef QUIC_RGB
static void clear_row(four_bytes_t *row, int width)
{
    four_bytes_t *end;
    for (end = row + width; row < end; row++) {
        row->a = 0;
    }
}

#endif

#ifdef QUIC_RGB

static void uncompress_rgba(Encoder *encoder, uint8_t *buf, int stride)
{
    unsigned int row;
    uint8_t *prev;

    encoder->channels[0].correlate_row[-1] = 0;
    encoder->channels[1].correlate_row[-1] = 0;
    encoder->channels[2].correlate_row[-1] = 0;
    quic_rgb32_uncompress_row0(encoder, (rgb32_pixel_t *)buf, encoder->width);

    encoder->channels[3].correlate_row[-1] = 0;
    quic_four_uncompress_row0(encoder, &encoder->channels[3], (four_bytes_t *)(buf + 3),
                              encoder->width);

    encoder->rows_completed++;
    for (row = 1; row < encoder->height; row++) {
        prev = buf;
        buf += stride;

        encoder->channels[0].correlate_row[-1] = encoder->channels[0].correlate_row[0];
        encoder->channels[1].correlate_row[-1] = encoder->channels[1].correlate_row[0];
        encoder->channels[2].correlate_row[-1] = encoder->channels[2].correlate_row[0];
        quic_rgb32_uncompress_row(encoder, (rgb32_pixel_t *)prev, (rgb32_pixel_t *)buf,
                                  encoder->width);

        encoder->channels[3].correlate_row[-1] = encoder->channels[3].correlate_row[0];
        quic_four_uncompress_row(encoder, &encoder->channels[3], (four_bytes_t *)(prev + 3),
                                 (four_bytes_t *)(buf + 3), encoder->width);

        encoder->rows_completed++;
    }
}

#endif

static void uncompress_gray(Encoder *encoder, uint8_t *buf, int stride)
{
    unsigned int row;
    uint8_t *prev;

    encoder->channels[0].correlate_row[-1] = 0;
    quic_one_uncompress_row0(encoder, &encoder->channels[0], (one_byte_t *)buf, encoder->width);
    encoder->rows_completed++;
    for (row = 1; row < encoder->height; row++) {
        prev = buf;
        buf += stride;
        encoder->channels[0].correlate_row[-1] = encoder->channels[0].correlate_row[0];
        quic_one_uncompress_row(encoder, &encoder->channels[0], (one_byte_t *)prev,
                                (one_byte_t *)buf, encoder->width);
        encoder->rows_completed++;
    }
}

#define QUIC_UNCOMPRESS_RGB(prefix, type)                                                       \
        encoder->channels[0].correlate_row[-1] = 0;                                             \
        encoder->channels[1].correlate_row[-1] = 0;                                             \
        encoder->channels[2].correlate_row[-1] = 0;                                             \
        quic_rgb##prefix##_uncompress_row0(encoder, (type *)buf, encoder->width);  \
        encoder->rows_completed++;                                                              \
        for (row = 1; row < encoder->height; row++) {                                           \
            prev = buf;                                                                         \
            buf += stride;                                                                      \
            encoder->channels[0].correlate_row[-1] = encoder->channels[0].correlate_row[0];     \
            encoder->channels[1].correlate_row[-1] = encoder->channels[1].correlate_row[0];     \
            encoder->channels[2].correlate_row[-1] = encoder->channels[2].correlate_row[0];     \
            quic_rgb##prefix##_uncompress_row(encoder, (type *)prev, (type *)buf,               \
                                              encoder->width);                                  \
            encoder->rows_completed++;                                                          \
        }
        
int quic_decode(QuicContext *quic, QuicImageType type, uint8_t *buf, int stride)
{
    Encoder *encoder = (Encoder *)quic;
    unsigned int row;
    uint8_t *prev;
#ifndef QUIC_RGB
    int i;
#endif

    ASSERT(encoder->usr, buf);

    switch (encoder->type) {
#ifdef QUIC_RGB
    case QUIC_IMAGE_TYPE_RGB32:
    case QUIC_IMAGE_TYPE_RGB24:
        if (type == QUIC_IMAGE_TYPE_RGB32) {
            ASSERT(encoder->usr, ABS(stride) >= (int)encoder->width * 4);
            QUIC_UNCOMPRESS_RGB(32, rgb32_pixel_t);
            break;
        } else if (type == QUIC_IMAGE_TYPE_RGB24) {
            ASSERT(encoder->usr, ABS(stride) >= (int)encoder->width * 3);
            QUIC_UNCOMPRESS_RGB(24, rgb24_pixel_t);
            break;
        }
        encoder->usr->warn(encoder->usr, "unsupported output format\n");
        return QUIC_ERROR;
    case QUIC_IMAGE_TYPE_RGB16:
        if (type == QUIC_IMAGE_TYPE_RGB16) {
            ASSERT(encoder->usr, ABS(stride) >= (int)encoder->width * 2);
            QUIC_UNCOMPRESS_RGB(16, rgb16_pixel_t);
        } else if (type == QUIC_IMAGE_TYPE_RGB32) {
            ASSERT(encoder->usr, ABS(stride) >= (int)encoder->width * 4);
            QUIC_UNCOMPRESS_RGB(16_to_32, rgb32_pixel_t);
        } else {
            encoder->usr->warn(encoder->usr, "unsupported output format\n");
            return QUIC_ERROR;
        }

        break;
    case QUIC_IMAGE_TYPE_RGBA:

        if (type != QUIC_IMAGE_TYPE_RGBA) {
            encoder->usr->warn(encoder->usr, "unsupported output format\n");
            return QUIC_ERROR;
        }
        ASSERT(encoder->usr, ABS(stride) >= (int)encoder->width * 4);
        uncompress_rgba(encoder, buf, stride);
        break;
#else
    case QUIC_IMAGE_TYPE_RGB24:
        ASSERT(encoder->usr, ABS(stride) >= (int)encoder->width * 3);
        for (i = 0; i < 3; i++) {
            encoder->channels[i].correlate_row[-1] = 0;
            quic_three_uncompress_row0(encoder, &encoder->channels[i], (three_bytes_t *)(buf + i),
                                       encoder->width);
        }
        encoder->rows_completed++;
        for (row = 1; row < encoder->height; row++) {
            prev = buf;
            buf += stride;
            for (i = 0; i < 3; i++) {
                encoder->channels[i].correlate_row[-1] = encoder->channels[i].correlate_row[0];
                quic_three_uncompress_row(encoder, &encoder->channels[i],
                                          (three_bytes_t *)(prev + i),
                                          (three_bytes_t *)(buf + i),
                                          encoder->width);
            }
            encoder->rows_completed++;
        }
        break;
    case QUIC_IMAGE_TYPE_RGB32:
        ASSERT(encoder->usr, ABS(stride) >= encoder->width * 4);
        for (i = 0; i < 3; i++) {
            encoder->channels[i].correlate_row[-1] = 0;
            quic_four_uncompress_row0(encoder, &encoder->channels[i], (four_bytes_t *)(buf + i),
                                      encoder->width);
        }
        clear_row((four_bytes_t *)(buf + 3), encoder->width);
        encoder->rows_completed++;
        for (row = 1; row < encoder->height; row++) {
            prev = buf;
            buf += stride;
            for (i = 0; i < 3; i++) {
                encoder->channels[i].correlate_row[-1] = encoder->channels[i].correlate_row[0];
                quic_four_uncompress_row(encoder, &encoder->channels[i],
                                         (four_bytes_t *)(prev + i),
                                         (four_bytes_t *)(buf + i),
                                         encoder->width);
            }
            clear_row((four_bytes_t *)(buf + 3), encoder->width);
            encoder->rows_completed++;
        }
        break;
    case QUIC_IMAGE_TYPE_RGBA:
        ASSERT(encoder->usr, ABS(stride) >= encoder->width * 4);
        for (i = 0; i < 4; i++) {
            encoder->channels[i].correlate_row[-1] = 0;
            quic_four_uncompress_row0(encoder, &encoder->channels[i], (four_bytes_t *)(buf + i),
                                      encoder->width);
        }
        encoder->rows_completed++;
        for (row = 1; row < encoder->height; row++) {
            prev = buf;
            buf += stride;
            for (i = 0; i < 4; i++) {
                encoder->channels[i].correlate_row[-1] = encoder->channels[i].correlate_row[0];
                quic_four_uncompress_row(encoder, &encoder->channels[i],
                                         (four_bytes_t *)(prev + i),
                                         (four_bytes_t *)(buf + i),
                                         encoder->width);
            }
            encoder->rows_completed++;
        }
        break;
#endif
    case QUIC_IMAGE_TYPE_GRAY:

        if (type != QUIC_IMAGE_TYPE_GRAY) {
            encoder->usr->warn(encoder->usr, "unsupported output format\n");
            return QUIC_ERROR;
        }
        ASSERT(encoder->usr, ABS(stride) >= (int)encoder->width);
        uncompress_gray(encoder, buf, stride);
        break;
    case QUIC_IMAGE_TYPE_INVALID:
    default:
        encoder->usr->error(encoder->usr, "bad image type\n");
    }
    return QUIC_OK;
}

static int need_init = TRUE;

QuicContext *quic_create(QuicUsrContext *usr)
{
    Encoder *encoder;

    if (!usr || need_init || !usr->error || !usr->warn || !usr->info || !usr->malloc ||
        !usr->free || !usr->more_space || !usr->more_lines) {
        return NULL;
    }

    if (!(encoder = (Encoder *)usr->malloc(usr, sizeof(Encoder)))) {
        return NULL;
    }

    if (!init_encoder(encoder, usr)) {
        usr->free(usr, encoder);
        return NULL;
    }
    return (QuicContext *)encoder;
}

void quic_destroy(QuicContext *quic)
{
    Encoder *encoder = (Encoder *)quic;
    int i;

    if (!quic) {
        return;
    }

    for (i = 0; i < MAX_CHANNELS; i++) {
        destroy_channel(&encoder->channels[i]);
    }
    encoder->usr->free(encoder->usr, encoder);
}

void quic_init()
{
    if (!need_init) {
        return;
    }
    need_init = FALSE;

    family_init(&family_8bpc, 8, DEFmaxclen);
    family_init(&family_5bpc, 5, DEFmaxclen);
#if defined(RLE) && defined(RLE_STAT)
    init_zeroLUT();
#endif
}

